Imports


In [ ]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import the classifiers we will be using
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import train/test split function
from sklearn.model_selection import train_test_split

# Import cross validation scorer
from sklearn.model_selection import cross_val_score

# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score

Read the data

This is a breast cancer diagnostic dataset: these features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.

"diagnosis" is our target: 0 for benign, 1 for malignant.


In [ ]:
# Read in our dataset, using the parameter 'index_col' to select the index

In [ ]:
# Let's see the header

In [ ]:
# And the shape

In [ ]:
# Assign the features and the target

Train/test split


In [ ]:
# Create the train/test split

Modelling with standard train/test split


In [ ]:
# Choose the Decision Tree model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

In [ ]:
# Choose the K-Nearest Neighbors model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

In [ ]:
# Choose the Naive Bayes model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

In [ ]:
# Choose the Random Forest model

# Fit the model

# Make the predictions

# Score the predictions

# Print the score

Modelling with k-fold cross validation


In [ ]:
# Choose the Decision Tree model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

In [ ]:
# Choose the K-Nearest Neighbors model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

In [ ]:
# Choose the Naive Bayes model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

In [ ]:
# Choose the Random Forest model

# Fit, predict and score in one step, using cross_val_score()

# Print the scores

# Print the mean score

Bonus exercise:

Check the documentation for each of the models you used.

Try different hyperparameters, and see if you can improve the score!

Some ideas:

- Tune the number of neighbors in the K-Nearest Neighbors model.

- Try balancing the class weight, and the maximum depth, on the Decision Tree and Random Forest models.


In [ ]: